## (A) Conditional independence

# Independence ==> Uncorrelated
# Correlated == > Dependence

cor.test_serieA <- c(cor.test(score1_serieA, score2_serieA, alternative="two.sided", method = "pearson")$p.value, cor.test(score1_serieA, score2_serieA, alternative="two.sided", method = "spearman", exact = FALSE)$p.value, cor.test(score1_serieA, score2_serieA, alternative="two.sided", method = "kendall", exact = FALSE)$p.value)

cor.test_Liga <- c(cor.test(score1_Liga, score2_Liga, alternative="two.sided", method = "pearson")$p.value, cor.test(score1_Liga, score2_Liga, alternative="two.sided", method = "spearman", exact= FALSE)$p.value, cor.test(score1_Liga, score2_Liga, alternative="two.sided", method = "kendall", exact= FALSE)$p.value)

cor.test_PL <- c(cor.test(score1_PL, score2_PL, alternative="two.sided", method = "pearson")$p.value, cor.test(score1_PL, score2_PL, alternative="two.sided", method = "spearman", exact= FALSE)$p.value, cor.test(score1_PL, score2_PL, alternative="two.sided", method = "kendall", exact= FALSE)$p.value)

cor.test_BundesLiga <- c(cor.test(score1_BundesLiga, score2_BundesLiga, alternative="two.sided", method = "pearson")$p.value, cor.test(score1_BundesLiga, score2_BundesLiga, alternative="two.sided", method = "spearman", exact= FALSE)$p.value, cor.test(score1_BundesLiga, score2_BundesLiga, alternative="two.sided", method = "kendall", exact= FALSE)$p.value)

cor.test_serieA; cor.test_PL; cor.test_Liga; cor.test_BundesLiga


# The partial conclusion is the following: if there is correlation---and this happens for PL, Liga and BundesLiga---the samLigaes may be considered as dependent. But, in each of these three cases, correlation is slightly negative. Then, our conclusion is that bivariate poisson distributions are not well suited for this.


#Let'see now in terms of predictive measure

independence_assumption <- function(league, ngames_train, y_rep1, y_rep2, score1, score2, M){
  
  emp_cor_pearson <- cor(score1, score2, method="pearson")
  emp_cor_kendall <- cor(score1, score2, method="kendall")
  emp_cor_spearman <- cor(score1, score2, method="spearman")
  
  mcmc_cor_pearson = mcmc_cor_kendall = mcmc_cor_spearman = c()
  for (i in 1:M){
  mcmc_cor_pearson[i] <-cor(y_rep1[i,], y_rep2[i,], method="pearson")
  mcmc_cor_kendall[i] <-cor(y_rep1[i,], y_rep2[i,], method="kendall")
  mcmc_cor_spearman[i] <-cor(y_rep1[i,], y_rep2[i,], method="spearman")
  }
  par(mfrow=c(1,3))
  hist(mcmc_cor_pearson, breaks=40, probability = TRUE, 
    main ="Pearson correlation", xlab = expression(rho),
    xlim=c(min(mcmc_cor_pearson, emp_cor_pearson )-0.02,
      max(mcmc_cor_pearson, emp_cor_pearson )+0.02))
  abline(v= emp_cor_pearson, lwd=2, lty=2)
  hist(mcmc_cor_kendall, breaks=40, probability = TRUE, 
    main ="Kendall correlation",xlab=expression(tau),
    xlim=c(min(mcmc_cor_kendall, emp_cor_kendall )-0.02,
      max(mcmc_cor_kendall, emp_cor_kendall )+0.02))
  abline(v= emp_cor_kendall, lwd=2, lty=2)
  hist(mcmc_cor_spearman, breaks=40, probability = TRUE, 
    main ="Spearman correlation", xlab=expression(rho[s]),
    xlim=c(min(mcmc_cor_spearman, emp_cor_spearman )-0.02,
      max(mcmc_cor_spearman, emp_cor_spearman )+0.02))
  abline(v= emp_cor_spearman, lwd=2, lty=2)
}


#save directly from the R window (width:545, height:312)
load("y_rep2Ita.RData")
independence_assumption("Ita", ngames_train = ngames_train_serieA, y_rep1=y_rep1, y_rep2=y_rep2,
score1=score1_serieA, score2=score2_serieA, M=M)

load("y_rep2Eng.RData")
independence_assumption("Eng", ngames_train = ngames_train_PL, y_rep1=y_rep1, y_rep2=y_rep2,
  score1=score1_PL, score2=score2_PL, M=M)

load("y_rep2Spa.RData")
independence_assumption("Spa", ngames_train = ngames_train_Liga, y_rep1=y_rep1, y_rep2=y_rep2,
  score1=score1_Liga, score2=score2_Liga, M=M)

load("y_rep2Ger.RData")
independence_assumption("Ger", ngames_train = ngames_train_BundesLiga, y_rep1=y_rep1, y_rep2=y_rep2,
  score1=score1_BundesLiga, score2=score2_BundesLiga, M=M)






## (B) Draw inflation

#We compute the frequencies


freq_table <- function( dataset, ngames_train, ngames_tot, 
  ngames_test, hat_theta1_prev, hat_theta2_prev) {
  
  freq_insample <- table(dataset[1:ngames_train,5])/ngames_train
  freq_outsample <-
    table(dataset[(ngames_train+1):ngames_tot,5])/(ngames_tot-ngames_train)
  skellam_prob=matrix(NA, ngames_test,3)
  
  for (n in 1:ngames_test)
    skellam_prob[n,]=c(pskellam(0,hat_theta1_prev[n], 
      hat_theta2_prev[n], lower.tail=F),
      dskellam(0, hat_theta1_prev[n], hat_theta2_prev[n] ),
      1-pskellam(0,hat_theta1_prev[n], hat_theta2_prev[n], lower.tail=F)-
        dskellam(0, hat_theta1_prev[n], hat_theta2_prev[n] ) )
  
  results=c()
  results[dataset[,5]=="H"]=1; 
  results[dataset[,5]=="D"]=2; 
  results[dataset[,5]=="A"]=3
  results_test=results[(ngames_train+1):ngames_tot]
  
  
  p_kroenecker_outsample=matrix(NA, ngames_test,3)
  for (h in 1:ngames_test){
    p_kroenecker_outsample[h, results_test[h]]=(  skellam_prob[h,results_test[h]  ])
  }
  average_p_correct_model_outsample=
    apply(p_kroenecker_outsample,2, mean, na.rm=TRUE)
  
  
  return( list( freq_insample = freq_insample, freq_outsample =  freq_outsample,
    average_p_correct_model_outsample= average_p_correct_model_outsample))
}



# New predictive plot ppcheck (see stancon 2017)

score_difference_plot <- function(league, y_rep1, y_rep2, ngames_train, score1, score2 ){
library(matrixStats)


diff_gol=score1-score2
diff_gol_rep=y_rep1-y_rep2

scd <- diff_gol
scd_sims <- diff_gol_rep

scd_hat <- colMedians(scd_sims)
scd_se <- sqrt(colVars(scd_sims))
alpha <- 0.95;
scd_ub <- colQuantiles(scd_sims, probs = 1-(1-alpha)/2)
scd_lb <- colQuantiles(scd_sims, probs = (1-alpha)/2)
ci95 <- sum(scd < scd_ub & scd_lb<scd)/ngames_train
ngames_train_draw <- sum(scd ==0)
scd_draw <- scd[scd==0]
ci95_draw <- sum(scd_draw < scd_ub[scd==0] & scd_lb[scd==0]<scd_draw)/ngames_train_draw
alpha <- 0.5;
scd_ub2 <- colQuantiles(scd_sims, probs = 1-(1-alpha)/2)
scd_lb2 <- colQuantiles(scd_sims, probs = (1-alpha)/2)
ci50 <- sum(scd < scd_ub2 & scd_lb2<scd)/ngames_train
ci50_draw <- sum(scd_draw < scd_ub2[scd==0] & scd_lb2[scd==0]<scd_draw)/ngames_train_draw


sort_scd <- scd[order(scd)]
sort_scd_hat <- scd_hat[order(scd)]
sort_scd_se <- scd_se[order(scd)]
sort_scd_ub <- scd_ub[order(scd)]
sort_scd_lb <- scd_lb[order(scd)]
sort_scd_ub2 <- scd_ub2[order(scd)]
sort_scd_lb2 <- scd_lb2[order(scd)]
df <- data.frame(list(scd = sort_scd, scd_hat = sort_scd_hat, scd_se = sort_scd_se,
  scd_ub = sort_scd_ub, scd_lb = sort_scd_lb, 
  scd_ub2 = sort_scd_ub2, scd_lb2 = sort_scd_lb2))

ggplot(df, aes(x = c(1:ngames_train))) +
  geom_ribbon(aes(ymin = scd_lb,
    ymax = scd_ub),
    fill="#F0E442") + 
  geom_ribbon(aes(ymin = scd_lb2,
    ymax = scd_ub2),
    fill="khaki3") + 
  #geom_line(aes(y=scd_hat),colour="darkred") + 
  #geom_point(aes(y=scd_hat),colour="darkred",shape=4) +
  geom_point(aes(y=scd), size = 0.5, col="blue") +
  scale_x_continuous(name="games") +
  #scale_y_discrete(name="score difference", limits=seq(-8,8)) + 
  scale_y_continuous(name="Goal difference", 
    breaks = c(-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8), 
    sec.axis = dup_axis()) 
#+
 # ggtitle("Estimated score differences (red) with 95% intervals (light yellow),
  #  \n  50% intervals (dark yellow), and the actual score differences (blue)");
ggsave(paste("diffPlot", league, ".pdf", sep=""), height=4.5, width=6)
return(c(ci50, ci95, ci50_draw, ci95_draw))

  
}




load("y_rep2Ita.RData")
ita_diff  <- score_difference_plot("Ita", y_rep1, y_rep2,
   ngames_train_serieA, score1_serieA,
  score2_serieA)

load("y_rep2Eng.RData")
eng_diff <- score_difference_plot("Eng", y_rep1, y_rep2,
  ngames_train_PL, score1_PL,
  score2_PL)

load("y_rep2Spa.RData")
spa_diff <- score_difference_plot("Spa", y_rep1, y_rep2, 
  ngames_train_Liga, score1_Liga,
  score2_Liga)

load("y_rep2Ger.RData")
ger_diff <- score_difference_plot("Ger", y_rep1, y_rep2, 
  ngames_train_BundesLiga, 
  score1_BundesLiga,
  score2_BundesLiga)

# per i draw

xtable (rbind(c( round(ger_diff,4)), c(round(eng_diff,4)), c(round(spa_diff,4)),
  c(round(ita_diff,4))))



## (C) Overdispersion

overdispersion_function <- function(league, y_rep1, y_rep2, ngames_train, score1, score2){
  
  rep1_mean <- apply(y_rep1,1,mean)
  rep2_mean <- apply(y_rep2,1,mean)
  rep1_var <- apply(y_rep1,1,var)
  rep2_var <- apply(y_rep2,1,var)
  
  emp_var1 <- var(score1)
  emp_var2 <- var(score2)
  par(mfrow=c(2,2))
  hist(rep1_var, xlab="Var(y1)", breaks = 40, probability = TRUE, xlim=c(1,3),
    main = "PP check for Var(y1)")
  abline(v= emp_var1, lwd=2, lty =2)
  hist(rep2_var, xlab="Var(y2)", breaks = 40, probability = TRUE, xlim=c(1,2),
    main = "PP check for Var(y2)")
  abline(v= emp_var2, lwd=2, lty =2)
  plot(rep1_mean, rep1_var, main="PP check mean1 vs variance1",
    xlab=expression(theta[m1]), 
    ylab = "Var(y1)", ylim=c(0.5,3))
  abline(a=0, b=1, lty=2, lwd=2, col="red")
  plot(rep2_mean, rep2_var,  main="PP check mean2 vs variance2",
    xlab=expression(theta[m2]), 
    ylab = "Var(y2)", ylim=c(0.5,3))
  abline(a=0, b=1, lty=2, lwd=2, col="red")
  
  
}

# save directly from R window

load("y_rep2Ita.RData")
overdispersion_function("Ita", y_rep1=y_rep1, y_rep2=y_rep2, ngames_train = ngames_train_serieA, 
  score1=score1_serieA, score2=score2_serieA)


load("y_rep2Eng.RData")
overdispersion_function("Ita", y_rep1=y_rep1, y_rep2=y_rep2, ngames_train = ngames_train_PL, 
  score1=score1_PL, score2=score2_PL)

load("y_rep2Spa.RData")
overdispersion_function("Ita", y_rep1=y_rep1, y_rep2=y_rep2, ngames_train = ngames_train_Liga, 
  score1=score1_Liga, score2=score2_Liga)

load("y_rep2Ger.RData")
overdispersion_function("Ita", y_rep1=y_rep1, y_rep2=y_rep2, ngames_train = ngames_train_BundesLiga, 
  score1=score1_BundesLiga, score2=score2_BundesLiga)


## Test checking for the profits


checking_profit <- function(league,dataset,
  previsioni1, previsioni2,ngames_train, ngames_test,ngames_tot, M, array_odds,spesa){
  
  results <- c()
  results[dataset[,5]=="H"] <- 1; 
  results[dataset[,5]=="D"] <- 2; 
  results[dataset[,5]=="A"] <- 3
  results_test=results[(ngames_train+1):ngames_tot]
  
  prev_matrix <- previsioni1-previsioni2
  
  prev_matrix[prev_matrix >0] <-1
  prev_matrix[prev_matrix==0] <-2
  prev_matrix[prev_matrix < 0]<-3
  
  select_function<-function(matrice, string_vector){
    el <-c()
    for (i in 1:length(string_vector)){
    el[i] <- matrice[i, string_vector[i]]
    }
    return(el)
      
  }
  
  
  
  ricavo <- matrix(0, M, agenzie)
  for (i in 1:M){
    df <- (1:ngames_test)[results_test==as.vector(prev_matrix[i,])]
   for (g in 1:agenzie){
         
         ricavo[i,g]<- sum(spesa[df,g]*
             select_function(array_odds[df,g, ],as.vector(prev_matrix[i, df])) )
   }}
  
  guadagno <- matrix(0, M, agenzie)
  for ( i in 1:M )
  guadagno[i,] <- ricavo[i,] -apply(spesa,2,sum)
  
  
  
  prev_median1 <-apply(previsioni1,2,median)
  prev_median2 <-apply(previsioni2,2,median)
  
  prev_vec <- prev_median1-prev_median2
  prev_vec[prev_vec >0] <-1
  prev_vec[prev_vec==0] <-2
  prev_vec[prev_vec < 0]<-3
  
  ricavo2 <- c()
  df2 <- (1:ngames_test)[results_test==as.vector(prev_vec)]
  for (g in 1:agenzie){
    
    ricavo2[g]<- sum(spesa[df2,g]*
        select_function(array_odds[df2,g, ],as.vector(prev_vec[df2])) )
  }
  
  guadagno2 <- c()
  guadagno2 <- ricavo2-apply(spesa,2,sum)
  
          
  return( list(guadagno=guadagno, guadagno2=guadagno2))
  
  
  
}


ita_profit <- checking_profit("Ita", serieA,
  previsioni1_serieA, 
  previsioni2_serieA,
  ngames_train_serieA, 
  ngames_test_serieA,
  ngames_tot_serieA, M, 
  array_odds_serieA[(ngames_train_serieA+1):ngames_tot_serieA,,], 
  spesa = #matrix(1, ngames_test_serieA, agenzie)) 
    serieA_betting$spesa_rue)
plot(density(ita_profit$guadagno[,1], adj=2))
for (j in 2:7){
  lines(density(ita_profit$guadagno[,j], adj=2),col=j)
}

eng_profit <- checking_profit("Eng", PL,
  previsioni1_PL, 
  previsioni2_PL,
  ngames_train_PL, 
  ngames_test_PL,
  ngames_tot_PL, M, 
  array_odds_PL[(ngames_train_PL+1):ngames_tot_PL,,],
  spesa = matrix(1, ngames_test_PL, agenzie)) 
hist(eng_profit, breaks=40, probability = TRUE)

spa_profit <- checking_profit("Spa", Liga,
  previsioni1_Liga, 
  previsioni2_Liga,
  ngames_train_Liga, 
  ngames_test_Liga,
  ngames_tot_Liga, M, 
  array_odds_Liga[(ngames_train_Liga+1):ngames_tot_Liga,,],
  spesa = matrix(1, ngames_test_Liga, agenzie) )
hist(spa_profit, breaks=40, probability = TRUE)

ger_profit <- checking_profit("Eng", BundesLiga,
  previsioni1_BundesLiga, 
  previsioni2_BundesLiga,
  ngames_train_BundesLiga, 
  ngames_test_BundesLiga,
  ngames_tot_BundesLiga, M, 
  array_odds_BundesLiga[(ngames_train_BundesLiga+1):ngames_tot_BundesLiga,,],
  spesa = matrix(1, ngames_test_BundesLiga, agenzie)) 
hist(ger_profit, breaks=40, probability = TRUE)






